setwd("~/sc_covid_PiB2023/data_science_project/Data") # path to data folder
dat <- read_h5ad("Pilot_2_rule_them_ALL.h5ad")
sc <- import('scanpy', convert = FALSE) # import scanpy as python module (by reticulate)
scvi <- import('scvi', convert = FALSE) # import scVI as python module (by reticulate)
scvi$settings$progress_bar_style = 'tqdm' # Set up a loading bar when running scVI
In cases where the data consists of fewer cells (5694) than genes (23382) scVI is expected to underfit the data, potentially leading to worse imputation accuracy. For scVI anywhere from 1.000 to 10.000 highly variable genes are recommended, but it will be context dependent. For now we choose 2000 HVGs.
To find HGVs we use the FindVariableFeatues function from the Seaurat package.
se_dat <- CreateSeuratObject(counts = t(dat$X)) # create Seurat object using expression matrix
nbf = 3000 # number of variable features we wish to select
se_dat <- FindVariableFeatures(se_dat, selection.method = "vst", nfeatures = nbf) # find number of variable features
Plot variable features:
top10 <- head(VariableFeatures(se_dat), 10) # We select 10 highest variable features
topnbf <- head(VariableFeatures(se_dat), nbf) # select variable features (gene names) to use in scVI
p1 <- VariableFeaturePlot(se_dat) # Create feature plot
p2 <- LabelPoints(plot = p1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0) # Overlay names of top 10 variable features
p2 # show plot
## Warning: Transformation introduced infinite values in continuous x-axis
sub_feature_dat <- dat[,topnbf] # Subset anndata object by selected variable features
Create and train model.
It should be noted that scvi models require the raw counts, but will run for non-negative real-valued data. Possible non-count values should be intended to represent pseudocounts, and not other normalized data, in which variance/covariance structure of the data has changes dramatically. Here we use normalized data.
sub_feature_dat = sub_feature_dat$copy()
# run setup_anndata
scvi$model$SCVI$setup_anndata(sub_feature_dat)
## None
# create model
model = scvi$model$SCVI(sub_feature_dat)
# train model
model$train()
# to specify the number of epochs when training:
# model$train(max_epochs = as.integer(400))
Returns 10 dimensional latent space.
Retrieve latent representation and visualize using UMAP and tSNE
latent = model$get_latent_representation()
latent <- data.matrix(latent)
The scVI latent space is saved onto the anndata object
dat$obsm$X_scVI <- latent # scVI latent space
write_h5ad(dat, '../Data/Pilot_2_rule_them_ALL.h5ad')
latent_UMAP <- RunUMAP(dat$obsm$X_scVI, n.components = 2, min.dist = 0.5, n.neighbors = 70) # run UMAP
## 20:35:16 UMAP embedding parameters a = 0.583 b = 1.334
## 20:35:16 Read 5694 rows and found 10 numeric columns
## 20:35:16 Using Annoy for neighbor search, n_neighbors = 70
## 20:35:16 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:35:16 Writing NN index file to temp file /scratch/18941268/Rtmpcdm90W/filee13c40c4912d
## 20:35:16 Searching Annoy index using 1 thread, search_k = 7000
## 20:35:20 Annoy recall = 100%
## 20:35:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 70
## 20:35:21 Initializing from normalized Laplacian + noise (using irlba)
## 20:35:21 Commencing optimization for 500 epochs, with 475120 positive edges
## 20:35:41 Optimization finished
latent_tSNE <- RunTSNE(dat$obsm$X_scVI, check_duplicates = TRUE, perplexity = 60, max_iter = 2000, theta = 0.2) # run tSNE
latent_df <- as.data.frame(dat$obs) # add annotations to data frame
latent_df$UMAP_1 <- latent_UMAP@cell.embeddings[,1] # add UMAP coordinates
latent_df$UMAP_2 <- latent_UMAP@cell.embeddings[,2]
latent_df$tSNE_1 <- latent_tSNE@cell.embeddings[,1] # add tSNE coordinates
latent_df$tSNE_2 <- latent_tSNE@cell.embeddings[,2]
cellType
p_UMAP_celltype <- latent_df %>%
arrange(viralLoad) %>%
ggplot(aes(x = UMAP_1, y = UMAP_2, color = cellType)) +
geom_point(alpha = 0.7) +
NULL
p_UMAP_celltype
Viral Load
latent_df %>%
arrange(viralLoad) %>%
ggplot(aes(x = UMAP_1, y = UMAP_2, color = viralLoad)) +
geom_point(alpha = 0.7) +
scale_color_gradient(low="blue", high="red") +
NULL
Celltype
latent_df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = cellType)) +
geom_point(alpha = 0.7) +
NULL
latent_df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = viralLoad)) +
geom_point(alpha = 0.7) +
scale_color_gradient(low="blue", high="red") +
NULL
When setting up the model it is possible to register batches and the subsequent model will correct for batch effects. Here we wish to correct for patient batch effect.
# run setup_anndata, use PatientID for batch
scvi$model$SCVI$setup_anndata(sub_feature_dat, batch_key = "PatientID")
## None
model_c = scvi$model$SCVI(sub_feature_dat) # create model
# train model
model_c$train()
# to specify the number of epochs when training:
# model$train(max_epochs = as.integer(400))
latent_c = model_c$get_latent_representation()
latent_c <- data.matrix(latent_c)
The scVI latent space is saved onto the anndata object
dat$obsm$X_scVI_corrected <- latent_c
# write_h5ad(dat, '../Data/Pilot_2_rule_them_ALL.h5ad')
cor_latent_UMAP <- RunUMAP(dat$obsm$X_scVI_corrected, n.components = 2, min.dist = 0.5, n.neighbors = 70)
## 20:37:02 UMAP embedding parameters a = 0.583 b = 1.334
## 20:37:02 Read 5694 rows and found 10 numeric columns
## 20:37:02 Using Annoy for neighbor search, n_neighbors = 70
## 20:37:02 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:37:02 Writing NN index file to temp file /scratch/18941268/Rtmpcdm90W/filee13c72b9f671
## 20:37:02 Searching Annoy index using 1 thread, search_k = 7000
## 20:37:06 Annoy recall = 100%
## 20:37:06 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 70
## 20:37:07 Initializing from normalized Laplacian + noise (using irlba)
## 20:37:07 Commencing optimization for 500 epochs, with 490478 positive edges
## 20:37:27 Optimization finished
## Warning: No assay specified, setting assay as RNA by default.
cor_latent_tSNE <- RunTSNE(dat$obsm$X_scVI_corrected, check_duplicates = TRUE, perplexity = 60, max_iter = 2000, theta = 0.2)
latent_df$cor_UMAP_1 <- cor_latent_UMAP@cell.embeddings[,1] # add UMAP coordinates
latent_df$cor_UMAP_2 <- cor_latent_UMAP@cell.embeddings[,2]
latent_df$cor_tSNE_1 <- cor_latent_tSNE@cell.embeddings[,1] # add tSNE coordinates
latent_df$cor_tSNE_2 <- cor_latent_tSNE@cell.embeddings[,2]
We compare the dispersion of PatientID in the latent before and after batch correction:
non_corrected <- latent_df %>%
ggplot(aes(x = UMAP_1, y = UMAP_2, color = PatientID)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
ggtitle("Non-corrected")
corrected <- latent_df %>%
ggplot(aes(x = cor_UMAP_1, y = cor_UMAP_2, color = PatientID)) +
geom_point(alpha = 0.7) + ggtitle("Patient effect corrected")
non_corrected + corrected
Likewise we compare bact-corrected and non-batch corrected on celltype and viralload annotations to see if these structures are preserved and potentially have become more apparent in the latent space once patient batch effect is removed.
non_corrected <- latent_df %>%
ggplot(aes(x = UMAP_1, y = UMAP_2, color = cellType)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
ggtitle("Non-corrected")
corrected <- latent_df %>%
ggplot(aes(x = cor_UMAP_1, y = cor_UMAP_2, color = cellType)) +
geom_point(alpha = 0.7) +
ggtitle("Patient effect corrected")
non_corrected + corrected
non_corrected <- latent_df %>%
arrange(viralLoad) %>%
ggplot(aes(x = UMAP_1, y = UMAP_2, color = viralLoad)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
ggtitle("Non-corrected")+
scale_color_gradient(low="blue", high="red")
corrected <- latent_df %>%
arrange(viralLoad) %>%
ggplot(aes(x = cor_UMAP_1, y = cor_UMAP_2, color = viralLoad)) +
geom_point(alpha = 0.7) + ggtitle("Patient effect corrected") +
scale_color_gradient(low="blue", high="red")
non_corrected + corrected
The same visualizations have been repeated with tSNE.
non_corrected <- latent_df %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = PatientID)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
ggtitle("Non-corrected")
corrected <- latent_df %>%
ggplot(aes(x = cor_tSNE_1, y = cor_tSNE_2, color = PatientID)) +
geom_point(alpha = 0.7) + ggtitle("Patient effect corrected")
non_corrected + corrected
non_corrected <- latent_df %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = cellType)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
ggtitle("Non-corrected")
corrected <- latent_df %>%
ggplot(aes(x = cor_tSNE_1, y = cor_tSNE_2, color = cellType)) +
geom_point(alpha = 0.7) +
ggtitle("Patient effect corrected")
non_corrected + corrected
non_corrected <- latent_df %>%
arrange(viralLoad) %>%
ggplot(aes(x = tSNE_1, y = tSNE_2, color = viralLoad)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
ggtitle("Non-corrected")+
scale_color_gradient(low="blue", high="red")
corrected <- latent_df %>%
arrange(viralLoad) %>%
ggplot(aes(x = cor_tSNE_1, y = cor_tSNE_2, color = viralLoad)) +
geom_point(alpha = 0.7) + ggtitle("Patient effect corrected") +
scale_color_gradient(low="blue", high="red")
non_corrected + corrected
An example use of scVI besides dimensionality reduction is for differential expression. Many differential expression models often use linear assumptions which may not capture complex batch effects, whereas deep generative models may not suffer from these problems by using complex nonlinear mappings.
We demonstrate a simple example of computing differential expression for cell type using scVI:
DE <- model$differential_expression(sub_feature_dat, groupby = "cellType", batch_correction = FALSE)
print(DE$head())
## proba_de proba_not_de ... group1 group2
## ETNPPL 0.8260 0.1740 ... Ciliated Rest
## CAMK4 0.8190 0.1810 ... Ciliated Rest
## CHAT 0.8170 0.1830 ... Ciliated Rest
## AL035078.1 0.8118 0.1882 ... Ciliated Rest
## IGKJ3 0.8108 0.1892 ... Ciliated Rest
##
## [5 rows x 22 columns]
It is also possible to use batch correction. Here we use batch corretion for patient.
DE <- model_c$differential_expression(sub_feature_dat, groupby = "cellType", batch_correction = TRUE)
print(DE$head())
## proba_de proba_not_de ... group1 group2
## TRAV12-2 0.8266 0.1734 ... Ciliated Rest
## FAP 0.8234 0.1766 ... Ciliated Rest
## PMP22 0.8176 0.1824 ... Ciliated Rest
## MEFV 0.8160 0.1840 ... Ciliated Rest
## MXD3 0.8158 0.1842 ... Ciliated Rest
##
## [5 rows x 22 columns]
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /home/anna7393/miniconda3/envs/scVI_env/lib/libopenblasp-r0.3.21.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] anndata_0.7.5.4 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0
## [5] dplyr_1.1.2 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0
## [9] tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0 reticulate_1.25
## [13] SeuratObject_4.1.3 Seurat_4.3.0
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.16 colorspace_2.1-0 deldir_1.0-9
## [4] ellipsis_0.3.2 ggridges_0.5.4 rprojroot_2.0.3
## [7] rstudioapi_0.14 spatstat.data_3.0-1 farver_2.1.1
## [10] leiden_0.4.3 listenv_0.9.0 ggrepel_0.9.3
## [13] fansi_1.0.4 codetools_0.2-19 splines_4.2.0
## [16] cachem_1.0.8 knitr_1.43 polyclip_1.10-4
## [19] jsonlite_1.8.4 ica_1.0-3 cluster_2.1.4
## [22] png_0.1-8 uwot_0.1.14 shiny_1.7.4
## [25] sctransform_0.3.5 spatstat.sparse_3.0-1 compiler_4.2.0
## [28] httr_1.4.6 assertthat_0.2.1 Matrix_1.5-4.1
## [31] fastmap_1.1.1 lazyeval_0.2.2 cli_3.6.1
## [34] later_1.3.1 htmltools_0.5.5 tools_4.2.0
## [37] igraph_1.3.5 gtable_0.3.3 glue_1.6.2
## [40] RANN_2.6.1 reshape2_1.4.4 Rcpp_1.0.10
## [43] scattermore_1.1 jquerylib_0.1.4 vctrs_0.6.2
## [46] nlme_3.1-162 spatstat.explore_3.2-1 progressr_0.13.0
## [49] lmtest_0.9-40 spatstat.random_3.1-5 xfun_0.39
## [52] globals_0.16.2 timechange_0.2.0 mime_0.12
## [55] miniUI_0.1.1.1 lifecycle_1.0.3 irlba_2.3.5.1
## [58] goftest_1.2-3 future_1.32.0 MASS_7.3-60
## [61] zoo_1.8-12 scales_1.2.1 hms_1.1.3
## [64] promises_1.2.0.1 spatstat.utils_3.0-3 parallel_4.2.0
## [67] RColorBrewer_1.1-3 yaml_2.3.7 pbapply_1.7-0
## [70] gridExtra_2.3 sass_0.4.6 stringi_1.7.6
## [73] highr_0.10 rlang_1.1.1 pkgconfig_2.0.3
## [76] matrixStats_0.63.0 evaluate_0.21 lattice_0.21-8
## [79] ROCR_1.0-11 tensor_1.5 labeling_0.4.2
## [82] patchwork_1.1.2 htmlwidgets_1.6.2 cowplot_1.1.1
## [85] tidyselect_1.2.0 here_1.0.1 parallelly_1.36.0
## [88] RcppAnnoy_0.0.20 plyr_1.8.8 magrittr_2.0.3
## [91] R6_2.5.1 generics_0.1.3 withr_2.5.0
## [94] pillar_1.9.0 fitdistrplus_1.1-11 survival_3.5-5
## [97] abind_1.4-5 sp_1.6-0 future.apply_1.11.0
## [100] KernSmooth_2.23-21 utf8_1.2.3 spatstat.geom_3.2-1
## [103] plotly_4.10.1 tzdb_0.4.0 rmarkdown_2.14
## [106] grid_4.2.0 data.table_1.14.8 digest_0.6.31
## [109] xtable_1.8-4 httpuv_1.6.11 munsell_0.5.0
## [112] viridisLite_0.4.1 bslib_0.4.2